# Third-Party Imports
import torch
import openai
from sentence_transformers import SentenceTransformer, util
from evaluate import load
from datasets import load_dataset
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
import requests as req
import wikipediaapi
from bs4 import BeautifulSoup
from dotenv import load_dotenv
load_dotenv()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
# Standard Imports
import os
import sys
import json
from string import punctuation
from math import log1p, inf
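# NLTK tokenisation and stopword removal below need the "punkt" and
# "stopwords" data packages; if they are missing, download them once:
# nltk.download("punkt")
# nltk.download("stopwords")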
# Plotting functions
def plot_bar_data(*bars, x=None, title="", x_label="", y_label=""):
    fig = go.Figure(
        layout={
            "title": title,
            "xaxis": {"title": x_label},
            "yaxis": {"title": y_label},
            "barmode": "group"
        },
        data=[
            go.Bar(name=name, x=x, y=y)
            for name, y in bars
        ]
    )
    return fig
def create_bar(name, data):
return (name, data)
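# Illustrative use of the plotting helpers (made-up values, not real
# results); each create_bar pairs a legend name with a list of y-values:
# demo_fig = plot_bar_data(
#     create_bar("Series A", [1, 2, 3]),
#     create_bar("Series B", [3, 2, 1]),
#     x=["Trial 1", "Trial 2", "Trial 3"],
#     title="Demo", x_label="Trial", y_label="Score"
# )
# demo_fig.show()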
# Tokenization
def tokenize(doc, remove_stopwords=True):
banned = list(punctuation)
if remove_stopwords:
banned += nltk.corpus.stopwords.words("english")
return [
w.lower() for w in nltk.word_tokenize(doc)
if w.lower() not in banned
]
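# Quick sanity check of the tokenizer (assuming the default NLTK models;
# stopwords and punctuation are stripped, everything is lower-cased):
# tokenize("The quick brown fox jumps over the lazy dog!")
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']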
# DocSearcher Class
# A single class implementing all of the NLP methods LAME uses for
# information extraction (including the WikiBot)
class DocSearcher:
    def __init__(self):
        self._corpus = dict()
        self._file_matches = 2  # number of top-ranked files kept as context
        self._sentence_matches = 1  # number of top-ranked sentences returned
        self._sent_transformer = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2"
        )
def view_corpus(self):
return self._corpus
def load_files(self, corpus):
self._corpus = corpus
def clear_files(self):
self._corpus = dict()
    def search(self, query, s_method='tf-idf'):
        fnames = self._corpus.keys()
        if s_method == 'tf-idf':
            joint_context, ranked_sents = self._context_and_sents_idf(query, fnames)
            output_text = self._build_output_text(ranked_sents, inf)
            answer = ' '.join(nltk.sent_tokenize(output_text)[:self._sentence_matches])
            return answer.strip()
        joint_context, ranked_sents = self._context_and_sents_cosine(query, fnames)
        if s_method == "cosine_sim":
            output_text = self._build_output_text(ranked_sents, inf)
            answer = ' '.join(nltk.sent_tokenize(output_text)[:self._sentence_matches])
        elif s_method == "bert":
            output_text = self._build_output_text(ranked_sents, 2048)
            answer = self._run_model_bert(query, output_text)
        elif s_method == "openai":
            output_text = self._build_output_text(ranked_sents, 2500)
            answer = self._run_model_openai(query, output_text)
        else:
            raise ValueError(f"Unknown search method: {s_method!r}")
        return answer.strip()
    def _build_output_text(self, ranked_sents, max_length=512):
        # Greedily append ranked sentences until the token budget is reached
        output_text = ''
for sent in ranked_sents:
new_sent = sent[0]
if len(nltk.word_tokenize(f'{output_text} {new_sent}')) <= max_length:
output_text += f' {new_sent}'
else:
break
return output_text
    def _run_model_bert(self, query, context):
        # Query a SQuAD-finetuned BERT model via the Hugging Face Inference API
        api_url = "https://api-inference.huggingface.co/models/bert-large-uncased-whole-word-masking-finetuned-squad"
        headers = {
            "Authorization": f"Bearer {os.getenv('HUGGING_FACE_API_KEY')}"
        }
        payload = {
            "inputs": {
                "question": query,
                "context": context
            }
        }
        res = req.post(api_url, headers=headers, data=json.dumps(payload))
        content = res.json()
        answer = content.get("answer")
        if not answer:
            return f"Error: {content.get('error', 'Something is wrong')}"
        return answer
    def _run_model_openai(self, query, text):
        openai.api_key = os.getenv("OPENAI_API_KEY")
        # The retrieved text is the context; the user's question is the query
        res = openai.Completion.create(
            model="text-davinci-003",
            prompt=f"Context: {text}\nQuery: {query}\n\nUsing only the context given, answer the query.",
            temperature=0,
            max_tokens=500,
        )
        return res.choices[0].text
def _context_and_sents_idf(self, query, fnames):
idfs = self._compute_idfs(fnames)
top_files = self._top_files_idf(query, idfs)
joint_context = "\n".join(self._corpus[name] for name in top_files)
ranked_sents = self._sent_rank_idf(query, joint_context, idfs)
return joint_context, ranked_sents
def _context_and_sents_cosine(self, query, fnames):
top_files = self._top_files_cosine(query, fnames)
joint_context = "\n".join(self._corpus[name] for name in top_files)
ranked_sents = self._sent_rank_cosine(query, joint_context)
return joint_context, ranked_sents
def _cosine_similarity(self, text_1, text_2, model):
embedding_1 = model.encode(text_1, convert_to_tensor=True)
embedding_2 = model.encode(text_2, convert_to_tensor=True)
return float(util.pytorch_cos_sim(embedding_1, embedding_2))
    def _compute_idfs(self, fnames):
        num_docs = len(fnames)
        # Tokenise each document once so that document frequency counts
        # whole-word matches rather than raw substring matches
        doc_words = {
            name: set(self._word_tokenize(self._corpus[name]))
            for name in fnames
        }
        unique_words = set().union(*doc_words.values())
        file_idfs = dict()
        for word in unique_words:
            num_apps = sum(1 for name in fnames if word in doc_words[name])
            file_idfs[word] = log1p(num_docs / num_apps)
        return file_idfs
    def _top_files_idf(self, query, idfs):
        # Tokenise each document once so that term frequency counts
        # whole-word matches rather than raw substring matches
        doc_tokens = {
            fname: self._word_tokenize(self._corpus[fname])
            for fname in self._corpus
        }
        tf_idfs = {fname: 0 for fname in self._corpus}
        for w in self._word_tokenize(query):
            for fname in self._corpus:
                tf_idfs[fname] += doc_tokens[fname].count(w) * idfs.get(w, 0)
        ranked_files = sorted(
            tf_idfs.items(),
            key=lambda x: x[1],
            reverse=True
        )
        return [file[0] for file in ranked_files][:self._file_matches]
    def _top_files_cosine(self, query, fnames):
        # Reuse the shared sentence transformer instead of loading a new one
        model = self._sent_transformer
        ranked_files = sorted([
            (name, self._cosine_similarity(query, self._corpus[name], model))
            for name in fnames
        ], key=lambda x: x[1], reverse=True)
        return [file[0] for file in ranked_files][:self._file_matches]
    def _word_tokenize(self, words):
        # Same behaviour as the module-level tokenize() with stopwords removed
        return tokenize(words, remove_stopwords=True)
def _sent_rank_idf(self, query, context, idfs):
query_set = set(self._word_tokenize(query))
sent_scores = { sent: [0,0] for sent in nltk.sent_tokenize(context)}
for sent in sent_scores:
sent_set = set(self._word_tokenize(sent))
common_words = query_set.intersection(sent_set)
sent_scores[sent][0] += sum(idfs.get(w, 0) for w in common_words)
sent_scores[sent][1] += len(common_words)
ranked_sents = sorted(
sent_scores.items(),
key=lambda x: (x[1][0], x[1][1]),
reverse=True
)
return [(sent, score[0]) for sent, score in ranked_sents]
def _sent_rank_cosine(self, query, context):
model = self._sent_transformer
sent_scores = {
sent: self._cosine_similarity(query, sent, model)
for sent in nltk.sent_tokenize(context)
}
ranked_sents = sorted(
sent_scores.items(),
key = lambda x: x[1],
reverse=True
)
return ranked_sents
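# Minimal sketch of DocSearcher usage on a toy corpus (hypothetical file
# name and text; the 'tf-idf' method runs locally and needs no API keys):
# searcher = DocSearcher()
# searcher.load_files({
#     "paris.txt": "Paris is the capital of France. It is home to the Eiffel Tower."
# })
# searcher.search("What is the capital of France?", s_method="tf-idf")
# searcher.clear_files()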
def load_squad_data(subset_size=5):
    # Sample without replacement so the subset contains no duplicate examples
    # (the SQuAD validation split has 10570 of them)
    indices = np.random.choice(10570, size=subset_size, replace=False)
    squad = load_dataset(
        "squad",
        split="validation",
    ).select(indices)
    return squad
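# For a reproducible subset, seed NumPy before sampling (seed is arbitrary):
# np.random.seed(0)
# squad = load_squad_data(subset_size=5)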
def predict_sample(squad_sample, method):
"""
Run an info extraction method on a single example from
the SQuAD dataset.
"""
# Get relevant properties from squad sample
question = squad_sample["question"]
title = squad_sample["title"]
context = squad_sample["context"]
sample_id = squad_sample["id"]
# Initialise doc searcher
doc_searcher = DocSearcher()
# Build and load corpus for doc searcher
doc_searcher.load_files({title: context})
# Get predicted text
pred_text = doc_searcher.search(question, method)
doc_searcher.clear_files()
# Build prediction object
pred_obj = {"prediction_text": pred_text, "id": sample_id}
# Build reference object
ref_obj = {"answers": squad_sample["answers"]}
ref_obj["id"] = sample_id
return pred_obj, ref_obj
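# Shapes of the returned objects (values here are purely illustrative):
# pred_obj == {"prediction_text": "<predicted answer>", "id": "<sample id>"}
# ref_obj  == {"answers": {"text": [...], "answer_start": [...]}, "id": "<sample id>"}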
def predict_samples(squad_ds, method):
"""
Run an info extraction method on multiple examples from
the SQuAD dataset.
"""
    # Initialise lists for storing prediction and reference objects
predictions = []
references = []
# Run method on all samples in dataset
for sample in squad_ds:
pred_obj, ref_obj = predict_sample(sample, method)
predictions.append(pred_obj)
references.append(ref_obj)
return predictions, references
def evaluate_method(squad_ds, method, squad_metric):
"""
Get the average exact match and F1 scores of an info
    extraction method after running it on a subset of SQuAD.
"""
# Get prediction and reference objects
preds, refs = predict_samples(squad_ds, method)
# Get results
results = squad_metric.compute(predictions=preds, references=refs)
return results
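# Note: the "squad" metric reports both scores as percentages on a
# 0-100 scale, e.g. {"exact_match": 80.0, "f1": 91.6} (example values).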
def visualise_results(results):
"""
    Take results from the method_evaluator function
    and create bar graphs to visualise them.
"""
method_labels = {
"tf-idf": "TF-IDF",
"bert": "BERT",
"openai": "OpenAI",
"cosine_sim": "Cosine Similarity"
}
plots = dict()
# Create plot for average scores
x = [method_labels[r["method"]] for r in results]
em_bar = create_bar("Average Exact Match Score", [r["avg_em"] for r in results])
f1_bar = create_bar("Average F1 Score", [r["avg_f1"] for r in results])
avg_score_plot = plot_bar_data(em_bar, f1_bar, x=x, title="Average Scores")
plots["average_score_plot"] = avg_score_plot
# Create plot for EM and F1 scores over multiple trials
for r in results:
x = [f"Sample #{i+1}" for i in range(len(r["f1_scores"]))]
em_bar = create_bar("Exact Match Score", r["em_scores"])
f1_bar = create_bar("F1 Score", r["f1_scores"])
new_plot = plot_bar_data(
em_bar,
f1_bar,
x=x,
title=f"Exact Match and F1 Scores for {method_labels[r['method']]}"
)
plots[f"{r['method']}_plot"] = new_plot
return plots
def method_evaluator(methods, num_trials=10, dataset_size=10):
"""
Evaluate several info extraction methods at once.
"""
# Initialise results object
results = [
{
"f1_scores": [],
"em_scores": [],
"method": m
}
for m in methods
]
# Load squad evaluator
squad_metric = load("squad")
for t in range(num_trials):
print(f"Trial #{t+1}")
squad_ds = load_squad_data(dataset_size)
for i, m in enumerate(methods):
result = evaluate_method(squad_ds, m, squad_metric)
results[i]["f1_scores"].append(result.get("f1", None))
results[i]["em_scores"].append(result.get("exact_match", None))
for i, _ in enumerate(results):
results[i]["avg_f1"] = np.mean(results[i]["f1_scores"])
results[i]["avg_em"] = np.mean(results[i]["em_scores"])
return results
# Get results of evaluation of each info extraction method
results = method_evaluator(["tf-idf", "cosine_sim", "bert", "openai"], 10, 10)
results
Trial #1
Found cached dataset squad (/Users/bhekimaenetja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
Trial #2
Trial #3
Trial #4
Trial #5
Trial #6
Trial #7
Trial #8
Trial #9
Trial #10
[{'f1_scores': [13.35003685003685,
11.878500861053205,
22.95218816271448,
21.09002569265053,
24.03466794643265,
15.938863698454492,
15.533238204679293,
18.12565464945294,
17.194956894956896,
16.67998429426143],
'em_scores': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'method': 'tf-idf',
'avg_f1': 17.677811725469276,
'avg_em': 0.0},
{'f1_scores': [16.079617211196158,
10.902597402597403,
24.296992481203006,
21.09002569265053,
20.390145801910506,
14.60553036512116,
18.89377035545534,
24.038210562008853,
21.273499405078354,
16.67998429426143],
'em_scores': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
'method': 'cosine_sim',
'avg_f1': 18.825037357148275,
'avg_em': 0.0},
{'f1_scores': [10.0,
100.0,
80.0,
95.36842105263159,
88.57142857142858,
86.66666666666666,
76.66666666666666,
84.0,
20.0,
95.95238095238095],
'em_scores': [10.0, 100.0, 80.0, 80.0, 80.0, 80.0, 70.0, 80.0, 20.0, 80.0],
'method': 'bert',
'avg_f1': 73.72255639097745,
'avg_em': 68.0},
{'f1_scores': [37.91301831235688,
25.80210221876888,
31.531937087019845,
38.18693773623832,
20.827731767367304,
31.934221806562228,
15.13217071466713,
26.437768582884665,
42.5697309226721,
25.009489885629865],
'em_scores': [10.0, 10.0, 0.0, 0.0, 0.0, 10.0, 0.0, 10.0, 10.0, 0.0],
'method': 'openai',
'avg_f1': 29.53451090341672,
'avg_em': 5.0}]
# Get data visualisations of results
results_plots = visualise_results(results)
# Results for TF-IDF
results_plots["tf-idf_plot"]
# Results for cosine similarity
results_plots["cosine_sim_plot"]
# Results for BERT
results_plots["bert_plot"]
# Results for OpenAI
results_plots["openai_plot"]
# Average F1 and exact match scores for all methods
results_plots["average_score_plot"]